# coding: utf-8
# 文件名称: 古诗文_spider.py
# 创建时间: 2021/6/13 23:08

import requests
import re
from lxml import etree

# Dynasty labels are wrapped in full-width brackets, e.g. "〔...〕"; the
# capture group keeps only the word characters between them.
_DYNASTY_RE = re.compile(r'〔(\w+)〕')


def _first(nodes, default=None):
    """Return the first item of an XPath result list, or *default* if empty."""
    return nodes[0] if nodes else default


def parse_page(url, timeout=10):
    """Download *url* and extract the poem records found on the page.

    Records are read from every ``div.sons`` element inside the
    ``div#leftZhankai`` container. The resulting list is printed (as the
    original script did) and also returned so callers can use it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list of dicts, one per poem, with the keys ``'标题'`` (title),
        ``'作者'`` (author), ``'朝代'`` (dynasty) and ``'内容'`` (list of
        the text nodes under ``div.contson``). Author and dynasty are
        ``None`` when the page does not provide them. The list is empty
        when the page has no ``div#leftZhankai`` container.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41'
    }
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    html = etree.HTML(response.text)
    container = _first(html.xpath('//div[@id="leftZhankai"]'))

    poems = []
    cards = container.xpath('.//div[@class="sons"]') if container is not None else []
    for card in cards:
        title = _first(card.xpath('.//b/text()'))
        if title is None:
            # A "sons" div without a bold title is not a poem card; skip it
            # rather than aborting the whole page.
            continue

        author = _first(card.xpath('.//p[@class="source"]/a[1]/text()'))
        dynasty_raw = _first(card.xpath('.//p[@class="source"]/a[2]/text()'))
        matched = _DYNASTY_RE.match(dynasty_raw) if dynasty_raw is not None else None
        # Fall back to the raw label when it is not in the bracketed form.
        dynasty = matched.group(1) if matched else dynasty_raw

        poems.append({
            '标题': title,
            '作者': author,
            '朝代': dynasty,
            '内容': card.xpath('.//div[@class="contson"]//text()'),
        })

    print(poems)
    return poems

def main():
    """Scrape page 1 of the so.gushiwen.cn poem listing via ``parse_page``."""
    first_page = 'https://so.gushiwen.cn/shiwens/default.aspx?page=1&tstr=&astr=&cstr=&xstr='
    parse_page(first_page)

# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
